import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import plotly
import datetime as dt
df = pd.read_csv('flightdata.csv')
print(len(df))
df.head()
28996563
| callsign | origin | destination | day | |
|---|---|---|---|---|
| 0 | CES597 | YBBN | CYVR | 2019-10-01 00:00:00+00:00 |
| 1 | ETH3640 | VHHH | EBMB | 2019-10-01 00:00:00+00:00 |
| 2 | ETH628 | KEWR | VTBS | 2019-10-01 00:00:00+00:00 |
| 3 | AMX023 | SAEZ | LEMD | 2019-10-01 00:00:00+00:00 |
| 4 | RAM208 | SBGR | CYUL | 2019-10-01 00:00:00+00:00 |
id_count = df['origin'].value_counts()
id_df = pd.DataFrame(id_count)
id_df = id_df[id_df.origin > 10000]
id_df.head(20)
| origin | |
|---|---|
| KORD | 512214 |
| KATL | 480633 |
| KDFW | 405724 |
| KDEN | 384610 |
| KLAX | 376336 |
| KLAS | 322010 |
| KPHX | 311131 |
| KCLT | 309693 |
| KSEA | 289971 |
| KSFO | 230016 |
| KBOS | 228509 |
| KPHL | 223126 |
| KMSP | 220727 |
| KEWR | 212623 |
| EHAM | 205968 |
| KMIA | 205157 |
| EDDF | 204575 |
| KJFK | 198515 |
| KDTW | 193545 |
| LFPG | 190589 |
id_count = df['destination'].value_counts()
id_df = pd.DataFrame(id_count)
id_df = id_df[id_df.destination > 10000]
id_df.head(20)
| destination | |
|---|---|
| KORD | 483433 |
| KATL | 432216 |
| KDFW | 353756 |
| KLAX | 337861 |
| KLAS | 303932 |
| KPHX | 296740 |
| KSEA | 278417 |
| KCLT | 269319 |
| KDEN | 264810 |
| KSFO | 222455 |
| KBOS | 219651 |
| KPHL | 207955 |
| KEWR | 201202 |
| KMIA | 196539 |
| KJFK | 195086 |
| EHAM | 192198 |
| EDDF | 191562 |
| KMSP | 187764 |
| EGLL | 186479 |
| LFPG | 186219 |
It could be observed that the top 20 airports with the most departing and arriving flights doesn't vary much. So to simplify the problem, the airports are chosen based on the number of departing flights.
First, a general study on the number of flights departing and arriving at each airport is done. The first 20 airports with the most departing flights are shown below. It could be observed that the top 10 airports with most departing flights are all US airports, whose ICAO code starts with 'K'. xxx So three areas are divided to analyse...
# America without US
df_Ame_ori = df[df['origin'].str.startswith('B')
| df['origin'].str.startswith('C')
| df['origin'].str.startswith('PA')
# | df['origin'].str.startswith('K')
| df['origin'].str.startswith('M')
| df['origin'].str.startswith('T')
| df['origin'].str.startswith('S')]
df_Ame_ori.head(10)
| callsign | origin | destination | day | |
|---|---|---|---|---|
| 3 | AMX023 | SAEZ | LEMD | 2019-10-01 00:00:00+00:00 |
| 4 | RAM208 | SBGR | CYUL | 2019-10-01 00:00:00+00:00 |
| 11 | CMP382 | SBGR | KSFO | 2019-10-01 00:00:00+00:00 |
| 28 | AVA020 | SBGR | KJFK | 2019-10-01 00:00:00+00:00 |
| 71 | CFG235 | MROC | EDDF | 2019-10-01 00:00:00+00:00 |
| 73 | AIJ2820 | CYVR | CYUL | 2019-10-01 00:00:00+00:00 |
| 87 | WJA2311 | CYYZ | CYYC | 2019-10-01 00:00:00+00:00 |
| 92 | ACA015 | CYYZ | VHHH | 2019-10-01 00:00:00+00:00 |
| 102 | NRS7506 | SAEZ | EGKK | 2019-10-01 00:00:00+00:00 |
| 112 | CMP470 | SAEZ | CYYZ | 2019-10-01 00:00:00+00:00 |
#US
df_US_ori = df[df['origin'].str.startswith('K')]
df_US_ori.head(10)
| callsign | origin | destination | day | |
|---|---|---|---|---|
| 2 | ETH628 | KEWR | VTBS | 2019-10-01 00:00:00+00:00 |
| 6 | QFA7554 | KORD | YMML | 2019-10-01 00:00:00+00:00 |
| 10 | UAE9252 | KORD | YSSY | 2019-10-01 00:00:00+00:00 |
| 15 | SIA37 | KLAX | WSSS | 2019-10-01 00:00:00+00:00 |
| 21 | CCA933 | KLAX | LFPG | 2019-10-01 00:00:00+00:00 |
| 33 | EVA619 | KLAX | RCTP | 2019-10-01 00:00:00+00:00 |
| 47 | CAL5167 | KLAX | RCTP | 2019-10-01 00:00:00+00:00 |
| 63 | AAR281 | KLAX | RKSI | 2019-10-01 00:00:00+00:00 |
| 82 | N191WT | KFTY | KSAN | 2019-10-01 00:00:00+00:00 |
| 90 | N4308S | KSTS | KSUS | 2019-10-01 00:00:00+00:00 |
id_AO_count = df_Ame_ori['origin'].value_counts()
id_df_AO = pd.DataFrame(id_AO_count)
id_df_AO = id_df_AO[id_df_AO.origin > 10000]
id_df_AO.head(10)
| origin | |
|---|---|
| CYYZ | 134386 |
| PANC | 119452 |
| CYVR | 86232 |
| CYUL | 64690 |
| CYYC | 64222 |
| SBGR | 52522 |
| TJSJ | 40115 |
| SBBR | 31673 |
| SBSP | 30818 |
| SBRJ | 29866 |
id_USO_count = df_US_ori['origin'].value_counts()
id_df_USO = pd.DataFrame(id_USO_count)
id_df_USO = id_df_USO[id_df_USO.origin > 10000]
id_df_USO.head(10)
| origin | |
|---|---|
| KORD | 512214 |
| KATL | 480633 |
| KDFW | 405724 |
| KDEN | 384610 |
| KLAX | 376336 |
| KLAS | 322010 |
| KPHX | 311131 |
| KCLT | 309693 |
| KSEA | 289971 |
| KSFO | 230016 |
Five US airports and five other airports in the Americas which have the most departing flights are under consideration: KORD, KATL, KDFW, KDEN, KLAX, CYYZ, PANC, CYVR, CYUL, CYYC.
df_Am = df[(df['origin'] == 'KORD') |
(df['origin'] == 'KATL') |
(df['origin'] == 'KDFW') |
(df['origin'] == 'KDEN') |
(df['origin'] == 'KLAX') |
(df['origin'] == 'CYYX') |
(df['origin'] == 'PANC') |
(df['origin'] == 'CYVR') |
(df['origin'] == 'CYUL') |
(df['origin'] == 'CYYC') ]
df_Am.head()
len(df_Am)
2494113
df_group = df_Am.groupby(['origin', 'day'], as_index=False).count()
df_group.head()
| origin | day | callsign | destination | |
|---|---|---|---|---|
| 0 | CYUL | 2019-10-01 00:00:00+00:00 | 156 | 156 |
| 1 | CYUL | 2019-10-02 00:00:00+00:00 | 173 | 173 |
| 2 | CYUL | 2019-10-03 00:00:00+00:00 | 183 | 183 |
| 3 | CYUL | 2019-10-04 00:00:00+00:00 | 192 | 192 |
| 4 | CYUL | 2019-10-05 00:00:00+00:00 | 164 | 164 |
fig = px.line(df_group , x="day", y="callsign", color="origin")
fig.update_xaxes(title={'text': 'Time'})
fig.update_yaxes(title={'text': 'Number of flights'})
fig.show()